Artificial Curiosity: Intrinsic Curiosity in Machines too!

1. Import dependencies

In [1]:
import torch
import numpy as np

from environments import SUPPORTED_ENVIRONMENTS, make_environment
from networks import ActorCritic, IntrinsicCuriosityModule
from utils import Recorder, Memory, load_checkpoint

2. Build environments and recorders

In [2]:
print(SUPPORTED_ENVIRONMENTS)
['Pong', 'Breakout', 'SuperMarioBros level 1', 'SuperMarioBros level 2']
In [3]:
recorder_1 = Recorder()
recorder_2 = Recorder()

pong = make_environment('Pong')
breakout = make_environment('Breakout')
mario_level_1 = make_environment('SuperMarioBros level 1')
mario_level_2 = make_environment('SuperMarioBros level 2')

3. Build a random agent

RL framework

source: Richard S. Sutton and Andrew G. Barto, "Reinforcement Learning: An Introduction"

In [4]:
class RandomAgent:
    """Baseline agent: samples uniformly random actions from the environment's action space."""

    def play(self, environment, max_games=1, max_steps=500, recorder=None):
        """Play until either `max_games` games finish or `max_steps` total steps elapse.

        Prints per-game stats ({'game', 'reward', 'game_duration'}) at the end of
        each game. If `recorder` is given, it is reset, fed one frame per step
        (plus the initial frame), and stopped at the end.
        """
        observation = environment.reset()

        steps_taken = 0
        games_finished = 0
        game_stats = {'game': 1, 'reward': 0, 'game_duration': 0}

        if recorder is not None:
            recorder.reset()
            recorder.record(environment)

        while steps_taken < max_steps and games_finished < max_games:

            # Sample a random action and apply it
            random_action = environment.action_space.sample()
            observation, extrinsic_reward, is_game_over, infos = environment.step(random_action)

            # Bookkeeping for the current game
            steps_taken += 1
            game_stats['reward'] += extrinsic_reward
            game_stats['game_duration'] += 1
            if recorder is not None:
                recorder.record(environment)

            if not is_game_over:
                continue

            # Game finished: report, start fresh stats, reset the environment
            games_finished += 1
            print(game_stats)
            game_stats = {'game': games_finished + 1, 'reward': 0, 'game_duration': 0}
            observation = environment.reset()

        if recorder is not None:
            recorder.stop()

4. Run the random agent

In [5]:
random_agent = RandomAgent()
random_agent.play(pong, max_games=1, max_steps=500, recorder=recorder_1)
<NormalizeObservation<StackFrames<PreprocessFrames<CustomPongActionSpace<TimeLimit<AtariEnv<PongNoFrameskip-v4>>>>>>>
Preparing the animation...
In [6]:
recorder_1.replay()
Out[6]:


Once Loop Reflect

5. Build a Smart Agent

Actor-Critic architecture

source: Richard S. Sutton and Andrew G. Barto, "Reinforcement Learning: An Introduction"

In [7]:
class ActorCriticAgent:
    """Advantage Actor-Critic agent that learns from short rollouts (at most 20 steps).

    Attributes:
        network: dict of named sub-networks; here {'actor_critic': ActorCritic}.
            Subclasses may add entries via `init_network`.
        trainable_parameters: flat list of parameters handed to the optimizer.
        optimizer: Adam over `trainable_parameters` (lr=1e-4).
        memory: rollout buffer (project `Memory`); the loss consumes entries in
            the exact order they are appended during `play`.
    """

    def __init__(self, num_actions, checkpoint=None):
        """Build network, optimizer and memory; optionally restore a checkpoint.

        Args:
            num_actions: size of the environment's discrete action space.
            checkpoint: optional checkpoint path/identifier handed to
                `load_checkpoint` together with the network and optimizer.
        """
        # Initialize network, optimizer and memory
        self.network, self.trainable_parameters = self.init_network(num_actions)
        self.optimizer = torch.optim.Adam(self.trainable_parameters, lr=1e-4)
        self.memory = Memory()
        
        # Load pretrained model
        if checkpoint is not None:
            load_checkpoint(self.network, self.optimizer, checkpoint)

    def init_network(self, num_actions):
        """Create the sub-network dict and its trainable parameters.

        Returns:
            (network, trainable_parameters) — overridden by subclasses to
            register extra modules (e.g. a curiosity module).
        """
        # Initialize Actor-Critic
        network = {'actor_critic': ActorCritic(num_actions)}
        trainable_parameters = list(network['actor_critic'].parameters())
        return network, trainable_parameters

    def play(self, environment, max_games=1, max_steps=500, train=False, verbose=False, recorder=None):
        """Play in rollouts of up to 20 steps until `max_games` games or `max_steps` steps.

        Each rollout appends, per step: {'value', 'log_policy', 'action'} then
        {'reward'}; `end_rollout` appends a final bootstrap {'value'}. The loss
        relies on this exact ordering.

        Args:
            environment: gym-like env (reset/step returning (obs, reward, done, infos)).
            max_games: stop after this many finished games.
            max_steps: stop after this many total environment steps (across games).
            train: if True, backpropagate the rollout loss after each rollout.
            verbose: if True, print the running game stats after every rollout.
            recorder: optional Recorder capturing one frame per step for replay.
        """
        # Reset environment
        observation = environment.reset()
        
        # Initialize infos and recorder
        n_steps = 0
        n_games = 0
        current_game_infos = {'game': n_games + 1, 'reward': 0, 'game_duration': 0}
        if recorder is not None:
            recorder.reset()
            recorder.record(environment)

        # Main loop
        while (n_steps < max_steps) and (n_games < max_games):
            
            # Reset memory
            self.init_rollout(observation)
            
            for rollout_step in range(20):
                
                # Interact with environment
                value, log_policy, action = self.network['actor_critic'](observation)
                self.memory.append({'value': value, 'log_policy': log_policy, 'action': action})
                
                # NOTE(review): assumes `action` is a batch-of-1 tensor — TODO confirm
                observation, extrinsic_reward, is_game_over, infos = environment.step(action.numpy()[0])
                
                # Reward shaping hook (base class: clipped extrinsic reward)
                reward = self.get_reward(observation, extrinsic_reward)
                self.memory.append({'reward': reward})

                # Update infos and recorder
                n_steps += 1
                current_game_infos['reward'] += extrinsic_reward
                current_game_infos['game_duration'] += 1
                if recorder is not None:
                    recorder.record(environment)

                if is_game_over:
                    # Update infos
                    n_games += 1
                    print(current_game_infos)
                    current_game_infos = {'game': n_games + 1, 'reward': 0, 'game_duration': 0}
                    # Reset environment
                    observation = environment.reset()
                    # Interrupt rollout
                    break
            
            # Append the bootstrap value for the truncated return
            self.end_rollout(observation, is_game_over)
            if verbose:
                print(current_game_infos)
            
            if train:
                # Update neural network
                loss = self.compute_loss()
                self.backpropagate(loss)

        if recorder is not None: recorder.stop()

    def init_rollout(self, observation):
        """Clear the rollout memory and cut gradient flow into the new rollout."""
        self.memory.reset()
        # Detach so gradients do not propagate across rollout boundaries
        self.network['actor_critic'].detach_internal_state()

    def end_rollout(self, observation, is_game_over):
        """Append the bootstrap value: 0 on game over, else V(observation) detached."""
        if is_game_over:
            next_value = torch.Tensor([[0]])
            # Clear the internal (presumably recurrent) state for the next game
            self.network['actor_critic'].reset_internal_state()
        else:
            next_value = self.network['actor_critic'](observation)[0].detach()
        self.memory.append({'value': next_value})

    def get_reward(self, observation, extrinsic_reward):
        """Training reward: the extrinsic reward clipped to [-1, 1]."""
        return np.clip(extrinsic_reward, -1, 1)

    def compute_loss(self):
        """Actor-Critic loss over the current rollout memory."""
        return self.network['actor_critic'].loss(self.memory)

    def backpropagate(self, loss, max_gradient_norm=40):
        """Gradient step: zero grads, backward, clip global norm, optimizer step.

        Args:
            loss: scalar loss tensor from `compute_loss`.
            max_gradient_norm: cap on the global gradient norm (default 40).
        """
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.trainable_parameters, max_gradient_norm)
        self.optimizer.step()

6. Run the Smart Agent

In [8]:
smart_mario_agent = ActorCriticAgent(num_actions=mario_level_1.action_space.n)
In [9]:
smart_mario_agent.play(mario_level_1, max_games=3, max_steps=500, verbose=True, recorder=recorder_1)
{'game': 1, 'reward': 0.6666666666666667, 'game_duration': 20}
{'game': 1, 'reward': -0.7999999999999999, 'game_duration': 40}
{'game': 1, 'reward': -1.9999999999999998, 'game_duration': 60}
{'game': 1, 'reward': -1.6000000000000005, 'game_duration': 80}
{'game': 1, 'reward': -2.8666666666666685, 'game_duration': 100}
{'game': 1, 'reward': -4.066666666666668, 'game_duration': 120}
{'game': 1, 'reward': 1.1999999999999986, 'game_duration': 140}
{'game': 1, 'reward': 6.133333333333332, 'game_duration': 160}
{'game': 1, 'reward': 9.133333333333335, 'game_duration': 180}
{'game': 1, 'reward': 5.799999999999999, 'game_duration': 200}
{'game': 1, 'reward': 3.9999999999999973, 'game_duration': 220}
{'game': 1, 'reward': 4.999999999999997, 'game_duration': 240}
{'game': 1, 'reward': 6.266666666666667, 'game_duration': 260}
{'game': 1, 'reward': 13.133333333333338, 'game_duration': 280}
{'game': 1, 'reward': 15.600000000000005, 'game_duration': 300}
{'game': 1, 'reward': 21.933333333333316, 'game_duration': 320}
{'game': 1, 'reward': 19.666666666666657, 'game_duration': 340}
{'game': 1, 'reward': 14.800000000000004, 'game_duration': 360}
{'game': 1, 'reward': 16.866666666666667, 'game_duration': 380}
{'game': 1, 'reward': 20.73333333333332, 'game_duration': 400}
{'game': 1, 'reward': 20.19999999999999, 'game_duration': 420}
{'game': 1, 'reward': 15.333333333333336, 'game_duration': 440}
{'game': 1, 'reward': 18.933333333333326, 'game_duration': 460}
{'game': 1, 'reward': 15.133333333333335, 'game_duration': 480}
{'game': 1, 'reward': 14.866666666666669, 'game_duration': 500}
Preparing the animation...
In [10]:
recorder_1.replay()
Out[10]:


Once Loop Reflect
In [11]:
smart_mario_agent.play(mario_level_1, max_games=10, max_steps=10000, train=True)
{'game': 1, 'reward': 17.400000000000016, 'game_duration': 2005}
{'game': 2, 'reward': 38.999999999999986, 'game_duration': 245}
{'game': 3, 'reward': 47.46666666666664, 'game_duration': 350}
{'game': 4, 'reward': 36.66666666666612, 'game_duration': 1430}
{'game': 5, 'reward': 16.400000000000006, 'game_duration': 38}
{'game': 6, 'reward': 41.86666666666666, 'game_duration': 187}
{'game': 7, 'reward': 49.39999999999984, 'game_duration': 508}
{'game': 8, 'reward': 40.733333333333285, 'game_duration': 172}
{'game': 9, 'reward': 14.600000000000005, 'game_duration': 43}
{'game': 10, 'reward': 48.999999999999936, 'game_duration': 220}
In [12]:
smart_mario_agent.play(mario_level_1, max_games=3, max_steps=500, recorder=recorder_1)
{'game': 1, 'reward': 15.26666666666667, 'game_duration': 53}
{'game': 2, 'reward': 15.866666666666667, 'game_duration': 38}
{'game': 3, 'reward': 38.79999999999998, 'game_duration': 115}
Preparing the animation...
In [13]:
recorder_1.replay()
Out[13]:


Once Loop Reflect
In [14]:
smart_mario_agent = ActorCriticAgent(num_actions=mario_level_1.action_space.n,
                                     checkpoint='models/smart_mario_agent_4M.tar')
smart_mario_agent.play(mario_level_1, max_games=3, max_steps=500, recorder=recorder_1)
{'game': 1, 'reward': 204.4000000000004, 'game_duration': 272}
Preparing the animation...
In [15]:
recorder_1.replay()
Out[15]:


Once Loop Reflect

7. Build a Curious Agent

Intrinsic Curiosity Module architecture

source: Pathak et al., "Curiosity-driven Exploration by Self-supervised Prediction" (2017)

In [16]:
class CuriousActorCriticAgent(ActorCriticAgent):
    """Actor-Critic agent augmented with an Intrinsic Curiosity Module (ICM).

    The training reward is the (clipped) forward-model prediction error of the
    ICM instead of the extrinsic reward, and the ICM loss is added to the
    Actor-Critic loss.
    """

    def init_network(self, num_actions):
        """Register the ICM next to the base Actor-Critic network."""
        network, trainable_parameters = super().init_network(num_actions)
        network['icm'] = IntrinsicCuriosityModule(num_actions)
        trainable_parameters = trainable_parameters + list(network['icm'].parameters())
        return network, trainable_parameters

    def init_rollout(self, observation):
        """Reset as in the base class, then seed memory with the first feature encoding."""
        super().init_rollout(observation)
        initial_features = self.network['icm'].observation_encoder(observation)
        self.memory.append({'features': initial_features})

    def end_rollout(self, observation, is_game_over):
        """Bootstrap from V(observation) unconditionally — game over is deliberately ignored."""
        bootstrap_value = self.network['actor_critic'](observation)[0].detach()
        self.memory.append({'value': bootstrap_value})

    def get_reward(self, observation, extrinsic_reward):
        """Intrinsic reward: ICM curiosity (clipped to [-1, 1]); `extrinsic_reward` is unused."""
        icm = self.network['icm']

        # Features/action from the previous step, and features of the new observation
        previous_features = self.memory.get_last('features')
        previous_action = self.memory.get_last('action')
        current_features = icm.observation_encoder(observation)

        # Forward model predicts the next features; inverse model predicts the action taken
        predicted_features = icm.forward_model(previous_features, previous_action)
        predicted_action = icm.inverse_model(previous_features, current_features)

        self.memory.append({'features': current_features,
                            'predicted_features': predicted_features,
                            'predicted_action': predicted_action})

        # Curiosity = surprise of the forward model, clipped like the extrinsic reward
        return np.clip(icm.curiosity(predicted_features, current_features), -1, 1)

    def compute_loss(self):
        """Total loss: Actor-Critic loss plus the ICM loss over the rollout memory."""
        return super().compute_loss() + self.network['icm'].loss(self.memory)
In [17]:
curious_mario_agent = CuriousActorCriticAgent(num_actions=mario_level_1.action_space.n, 
                                              checkpoint='models/curious_mario_agent_4M.tar')
curious_mario_agent.play(mario_level_1, max_games=5, max_steps=1000, recorder=recorder_1)
{'game': 1, 'reward': 14.933333333333337, 'game_duration': 42}
{'game': 2, 'reward': 14.866666666666674, 'game_duration': 44}
{'game': 3, 'reward': 89.06666666666672, 'game_duration': 191}
{'game': 4, 'reward': 69.39999999999998, 'game_duration': 138}
{'game': 5, 'reward': 15.066666666666668, 'game_duration': 38}
Preparing the animation...
In [18]:
recorder_1.replay()
Out[18]:


Once Loop Reflect
In [19]:
smart_mario_agent = ActorCriticAgent(num_actions=mario_level_1.action_space.n, 
                                     checkpoint='models/smart_mario_agent_4M.tar')
smart_mario_agent.play(mario_level_2, max_games=10, max_steps=5000, recorder=recorder_1, train=True)

curious_mario_agent = CuriousActorCriticAgent(num_actions=mario_level_1.action_space.n, 
                                              checkpoint='models/curious_mario_agent_4M.tar')
curious_mario_agent.play(mario_level_2, max_games=10, max_steps=5000, recorder=recorder_2, train=True)
{'game': 1, 'reward': 9.066666666666666, 'game_duration': 19}
{'game': 2, 'reward': 9.066666666666666, 'game_duration': 19}
{'game': 3, 'reward': 9.066666666666666, 'game_duration': 19}
{'game': 4, 'reward': 25.133333333333358, 'game_duration': 250}
{'game': 5, 'reward': 8.066666666666666, 'game_duration': 23}
{'game': 6, 'reward': 8.266666666666667, 'game_duration': 22}
{'game': 7, 'reward': 26.466666666666676, 'game_duration': 146}
{'game': 8, 'reward': 38.533333333333346, 'game_duration': 121}
{'game': 9, 'reward': 26.86666666666668, 'game_duration': 120}
{'game': 10, 'reward': 8.733333333333333, 'game_duration': 19}
Preparing the animation...
{'game': 1, 'reward': 38.266666666666644, 'game_duration': 185}
{'game': 2, 'reward': 8.133333333333333, 'game_duration': 23}
{'game': 3, 'reward': 8.733333333333334, 'game_duration': 20}
{'game': 4, 'reward': 8.600000000000001, 'game_duration': 21}
{'game': 5, 'reward': 7.999999999999999, 'game_duration': 24}
{'game': 6, 'reward': 8.266666666666667, 'game_duration': 22}
{'game': 7, 'reward': 7.066666666666667, 'game_duration': 29}
{'game': 8, 'reward': 53.46666666666668, 'game_duration': 142}
{'game': 9, 'reward': 37.46666666666663, 'game_duration': 176}
{'game': 10, 'reward': 6.533333333333333, 'game_duration': 32}
Preparing the animation...
In [20]:
recorder_1.replay()
Out[20]:


Once Loop Reflect
In [21]:
recorder_2.replay()
Out[21]:


Once Loop Reflect
In [ ]: